Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)
# Report the dataset dimensions and the ASD/control split of the samples
glue('Number of genes: {nrow(datExpr)}\n',
     'Number of samples: {ncol(datExpr)} ({sum(datMeta$Diagnosis_=="ASD")} ASD, ',
     '{sum(datMeta$Diagnosis_!="ASD")} controls)')
## Number of genes: 30112
## Number of samples: 80 (45 ASD, 35 controls)
# One row per probe: mean expression, whether it has any SFARI score, and
# whether it passes the DE filter (padj < 0.05 and |log2FC| > log2(1.2)).
# Assumes DE_info rows are aligned with datExpr rows -- TODO confirm upstream
# in data_preprocessing.Rmd.
plot_data = data.frame('id'=rownames(datExpr), 'mean_expression' = rowMeans(datExpr),
'SFARI_score'=DE_info$`gene-score`!='None',
'DExpressed'=DE_info$padj<0.05 & abs(DE_info$log2FoldChange)>log2(1.2))
# Density of probe mean expression split by DE status (log-scaled x axis)
ggplotly(plot_data %>% ggplot(aes(x=mean_expression, fill=DExpressed, color=DExpressed)) + geom_density(alpha=0.3) +
scale_x_log10() + ggtitle('Probe Mean Expression distribution') + theme_minimal())
We lose almost all of the probes with SFARI score
# Same density plot, restricted to probes that carry a SFARI score
plot_data_SFARI = plot_data %>% filter(SFARI_score)
ggplotly(plot_data_SFARI %>% ggplot(aes(x=mean_expression, fill=DExpressed, color=DExpressed)) + geom_density(alpha=0.3) +
ggtitle('Probe Mean Expression distribution for SFARI Genes') + theme_minimal())
# Fraction of SFARI probes that fail the DE filter and will be dropped below
print(paste0('Losing ', round(100*(1-mean(plot_data$DExpressed[plot_data$SFARI_score==TRUE])),1),
'% of the probes with SFARI score'))
## [1] "Losing 95.9% of the probes with SFARI score"
# Keep only the differentially expressed probes in all three objects.
# NOTE: DExpressed is built from DE_info$padj, which may contain NA (DESeq2
# sets padj to NA for independently filtered genes); subsetting with a logical
# vector containing NA would inject all-NA rows, so treat NA as "not DE".
keep_DE = !is.na(plot_data$DExpressed) & plot_data$DExpressed
datExpr = datExpr[keep_DE,]
DE_info = DE_info[keep_DE,]
datProbes = datProbes[keep_DE,]
rm(plot_data, plot_data_SFARI, keep_DE)
To make calculations more efficient, we can perform PCA and keep the first 19 principal components which explain over 98% of the total variance
# Reduce dimensionality with PCA, keeping the first 19 principal components
pca = prcomp(datExpr)
datExpr_redDim = as.data.frame(pca$x)[, 1:19]
# Collect every gene labelling in one named list so they can be compared later.
# The SFARI-derived labellings are not clusterings proper, but are stored in
# the same structure to compare the clustering methods against them.
clusterings = list()
clusterings[['SFARI_score']] = setNames(DE_info$`gene-score`, rownames(DE_info))
clusterings[['SFARI_bool']] = setNames(DE_info$`gene-score` != 'None', rownames(DE_info))
clusterings[['syndromic']] = setNames(DE_info$syndromic == 1, rownames(DE_info))
# Elbow method: total within-cluster sum of squares for k = 1..10
set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=200, nstart=25,
algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
# k = 4 chosen by eye from the elbow plot above
best_k = 4
abline(v = best_k, col='blue')
# Final fit. NOTE(review): this call omits algorithm='MacQueen', so it uses
# kmeans' default (Hartigan-Wong), unlike the elbow runs above -- confirm
# whether that inconsistency is intentional.
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=16 as best number of clusters. SFARI genes seem to group in the last two clusters
# Hierarchical clustering of the probes (euclidean distance; hclust's default
# complete linkage)
h_clusts = datExpr_redDim %>% dist %>% hclust
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
# Cut height chosen by eye; the blue line marks it on the dendrogram
abline(h=19.5, col='blue')
best_k = 16
SFARI and Neuronal related probes don’t seem to concentrate anywhere specific. Could be because there aren’t enough left to see a pattern
clusterings[['hc']] = cutree(h_clusts, best_k)
# Build a named viridis palette covering the range of numeric SFARI scores.
# Scores are stored as characters; 'None' coerces to NA and is ignored via
# na.rm when computing the range.
create_viridis_dict = function(){
  scores = as.numeric(clusterings[['SFARI_score']])
  score_range = range(scores, na.rm=TRUE)
  palette = viridis(score_range[2] - score_range[1] + 1)
  names(palette) = seq(score_range[1], score_range[2])
  palette
}
viridis_score_cols = create_viridis_dict()
# Colour bars for the dendrogram: one column per annotation, reordered to the
# leaf order of h_clusts (matched on probe ID)
dend_meta = DE_info[match(labels(h_clusts), DE_info$ID),] %>%
mutate('SFARI_score' = viridis_score_cols[`gene-score`], # Purple: 2, Yellow: 6
'SFARI_bool' = ifelse(`gene-score` != 'None', '#21908CFF', 'white'), # Acqua
'Syndromic' = ifelse(syndromic == TRUE, 'orange', 'white'),
'Neuronal' = ifelse(ID %in% GO_neuronal$ID, '#666666','white')) %>%
dplyr::select(SFARI_score, SFARI_bool, Syndromic, Neuronal)
# Blank out every leaf label: the dendrogram has one leaf per probe, i.e.
# nrow(datExpr_redDim) of them. (The original used nrow(datMeta), the number
# of SAMPLES, which supplies too few blanks.)
h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datExpr_redDim))) %>%
set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Samples are grouped into two big clusters, and then many outliers.
*Output plots in folder
*The rest of the output plots can be found in the Data/Gandal/consensusClustering/probes_DE/l1 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain at least 60% of the variance (the 19 components kept here actually explain over 98%)
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.01 using the fdrtool package
# ICA on the reduced data, keeping only non-gaussian components.
# Bug fix: the original `dplyr::select(names(apply(S, 2, kurtosis) > 3))` took
# names() of the WHOLE logical vector, which returns every column name, so the
# kurtosis filter never dropped anything; which() is required to keep only the
# components whose kurtosis exceeds 3.
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>%
  dplyr::select(names(which(apply(ICA_output$S, 2, kurtosis) > 3)))
# A gene belongs to a component when its loading is significant at FDR < 0.01
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F, verbose=F)$qval<0.01) %>% data.frame
# Drop components that capture a single gene
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
# Flat labelling: each gene gets (one of) its component name(s), NA if none.
# Later components overwrite earlier ones when a gene belongs to several.
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])
Leaves 73% of the probes without a cluster
# How many ICA components each gene belongs to (0 = unassigned)
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 6
## 772 242 29 11 1
# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) +
# geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
# theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
Trying again, this time with all of the principal components and 40 components
# Second ICA attempt: all principal components, 40 independent components.
# Bug fix: the kurtosis filter must go through which() -- names() of a bare
# logical vector returns every name, so the original select() kept all columns
# instead of only those with kurtosis > 3.
ICA_output = pca$x %>% data.frame %>% runICA(nbComp=40, method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>%
  dplyr::select(names(which(apply(ICA_output$S, 2, kurtosis) > 3)))
# Gene-to-component membership at FDR < 0.01, dropping single-gene components
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F, verbose=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
Doesn’t make a big difference, but it’s still better
# Membership counts per gene for the second run (0 = unassigned)
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3
## 747 258 46 4
# Re-derive the flat ICA labelling from the new run (overwrites the previous
# ICA_min / ICA_NA entries)
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])
# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) +
# geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
# theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
The soft R-squared value starts at 0.56 but then decreases and doesn’t rise again until the end; the highest value never reaches the usual 0.8 threshold
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 30, by=1))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.56800 1.3900 0.873 751.0 837.0 885
## 2 2 0.32000 0.6070 0.877 611.0 713.0 795
## 3 3 0.16700 0.3270 0.870 522.0 624.0 733
## 4 4 0.09170 0.2180 0.933 457.0 552.0 684
## 5 5 0.04370 0.1370 0.957 408.0 494.0 644
## 6 6 0.01240 0.0652 0.948 368.0 444.0 610
## 7 7 0.00105 0.0176 0.978 336.0 401.0 581
## 8 8 0.00153 -0.0203 0.947 308.0 364.0 555
## 9 9 0.01450 -0.0571 0.989 284.0 332.0 531
## 10 10 0.04330 -0.0973 0.979 264.0 303.0 510
## 11 11 0.07730 -0.1240 0.970 245.0 278.0 491
## 12 12 0.12800 -0.1500 0.951 229.0 255.0 473
## 13 13 0.15200 -0.1670 0.939 215.0 234.0 457
## 14 14 0.20900 -0.1960 0.896 202.0 216.0 442
## 15 15 0.25700 -0.2220 0.859 191.0 199.0 428
## 16 16 0.31500 -0.2450 0.792 180.0 183.0 414
## 17 17 0.37700 -0.2610 0.775 170.0 169.0 402
## 18 18 0.40100 -0.2770 0.761 161.0 157.0 390
## 19 19 0.45000 -0.2930 0.767 153.0 145.0 379
## 20 20 0.48800 -0.3110 0.773 146.0 135.0 369
## 21 21 0.53800 -0.3270 0.766 139.0 125.0 359
## 22 22 0.57500 -0.3420 0.766 132.0 116.0 350
## 23 23 0.59800 -0.3590 0.755 126.0 107.0 341
## 24 24 0.62300 -0.3700 0.759 121.0 99.8 332
## 25 25 0.64100 -0.3850 0.708 116.0 93.1 324
## 26 26 0.67100 -0.3950 0.729 111.0 87.1 316
## 27 27 0.69200 -0.4120 0.720 106.0 81.6 309
## 28 28 0.70500 -0.4250 0.711 102.0 76.2 302
## 29 29 0.67500 -0.4500 0.625 97.9 71.2 295
## 30 30 0.68700 -0.4670 0.630 94.1 66.5 288
# Build the WGCNA network with power=28, the best-fitting power in the table
# above even though SFT.R.sq never reaches the usual 0.8 threshold -- the
# scale-free assumption is questionable for this data
network = datExpr_redDim %>% t %>% blockwiseModules(power=28, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 0, 3
## ..there is nothing to merge.
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It leaves 172 genes without a cluster and classifies all the other genes in a single cluster
clusterings[['WGCNA']] %>% table
## .
## 0 1
## 172 883
The trajectory is a bit erratic. The lowest BIC is achieved at 20
# Gaussian mixture model: pick the number of components by BIC, then fit
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=40, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
# k = 20 chosen by eye from the BIC trajectory above
best_k = 20
gmm = datExpr_redDim %>% GMM(best_k)
# Hard assignment: each gene goes to the component with the highest likelihood
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)
Plot of clusters with their centroids in gray
# Append the centroids to the points so they appear in the same scatter plot;
# centroids get label NA, which ggplot renders in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Separating the two clouds of points into two clusters
# Split the genes with a hand-picked line in PC1/PC2 space:
# label 1 = points below the line, label 0 = points above it
intercept=4.5
slope=-0.01
manual_clusters = as.factor(as.numeric(slope*datExpr_redDim$PC1 + intercept > datExpr_redDim$PC2))
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
datExpr_redDim %>% ggplot(aes(PC1, PC2, color=manual_clusters)) + geom_point(alpha=0.3) +
# axis labels show the % of variance each PC explains
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
geom_abline(intercept=intercept, slope=slope, color='gray') +
theme_minimal() + ggtitle('PCA')
# Size of each manual cluster
clusterings[['Manual']] %>% table
## .
## 0 1
## 34 1021
# pca is no longer needed after the variance labels above
rm(intercept, slope, pca)
The blue cluster seems to have four gaussians in both Mean and SD and the salmon cluster has two in both.
# Per-gene mean and sd of the 19 PC coordinates, tagged with the manual label.
# NOTE(review): cbind() with a factor stores its integer codes, so the 0/1
# manual labels become 1/2 in the 'cluster' column -- the later filters on
# cluster==1 / cluster==2 rely on this coercion.
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
# Density of mean and sd, split by manual cluster, side by side
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
Separate probes into four and two Gaussians, respectively by their mean expression:
# Reproduce ggplot2's default categorical palette: n evenly spaced hues on the
# HCL colour wheel at luminance 65 and chroma 100
gg_colour_hue = function(n) {
  hue_angles = seq(15, 375, length = n + 1)[1:n]
  hcl(h = hue_angles, l = 65, c = 100)
}
# Fit a mean-expression GMM inside each manual cluster. Numeric cluster 1
# corresponds to manual label 0 (2 gaussians) and cluster 2 to manual label 1
# (4 gaussians), per the factor-code coercion noted where the data was built.
c1_mean = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(mean)
rownames(c1_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_mean = c1_mean %>% GMM(2)
c2_mean = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(mean)
rownames(c2_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_mean = c2_mean %>% GMM(4)
# Relabel each gene as '<manual cluster>_<gaussian>' by maximum likelihood
clusterings[['Manual_mean']] = as.character(clusterings[['Manual']])
clusterings[['Manual_mean']][clusterings[['Manual']]==0] = gmm_c1_mean$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_mean']][clusterings[['Manual']]==1] = gmm_c2_mean$Log_likelihood %>%
apply(1, function(x) glue('2_',which.max(x)))
# Overlay the six fitted gaussians on the mean-expression axis.
# NOTE(review): dnorm's sd argument is given the GMM covariance directly; if
# covariance_matrices holds variances this should be sqrt() of it -- confirm
# against the ClusterR::GMM documentation before relying on these curves.
plot_gaussians = manual_clusters_data %>% ggplot(aes(x=mean)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[1],
args=list(mean=gmm_c1_mean$centroids[1], sd=gmm_c1_mean$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[2],
args=list(mean=gmm_c1_mean$centroids[2], sd=gmm_c1_mean$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[3],
args=list(mean=gmm_c2_mean$centroids[1], sd=gmm_c2_mean$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[4],
args=list(mean=gmm_c2_mean$centroids[2], sd=gmm_c2_mean$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[5],
args=list(mean=gmm_c2_mean$centroids[3], sd=gmm_c2_mean$covariance_matrices[3])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[6],
args=list(mean=gmm_c2_mean$centroids[4], sd=gmm_c2_mean$covariance_matrices[4])) +
theme_minimal()
# NOTE(review): aes_string() is passed a factor, not a string; it appears to
# work through deparse, but aes() with a precomputed column would be safer
plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_mean']]))) +
geom_point() + theme_minimal()
grid.arrange(plot_gaussians, plot_points, ncol=2)
# Sizes of the six mean-based subclusters
clusterings[['Manual_mean']] %>% table
## .
## 1_1 1_2 2_1 2_2 2_3 2_4
## 13 21 248 310 273 190
Separate clusters into two Gaussians per diagnosis by their sd:
# Same procedure on the sd instead of the mean: two gaussians per manual
# cluster (numeric cluster 1 = manual label 0, cluster 2 = manual label 1)
n_clusters = 2
c1_sd = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(sd)
rownames(c1_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_sd = c1_sd %>% GMM(n_clusters)
c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_sd = c2_sd %>% GMM(n_clusters)
# Relabel each gene as '<manual cluster>_<gaussian>' by maximum likelihood
clusterings[['Manual_sd']] = as.character(clusterings[['Manual']])
clusterings[['Manual_sd']][clusterings[['Manual']]==0] = gmm_c1_sd$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_sd']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>%
apply(1, function(x) glue('2_',which.max(x)))
# Overlay the four fitted sd gaussians and show the resulting labels in PC
# space. NOTE(review): dnorm's sd argument is given the GMM covariance
# directly; if covariance_matrices holds variances this should be sqrt() --
# confirm against the ClusterR::GMM documentation.
plot_gaussians = manual_clusters_data %>% ggplot(aes(x=sd)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], #
args=list(mean=gmm_c1_sd$centroids[1], sd=gmm_c1_sd$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], #
args=list(mean=gmm_c1_sd$centroids[2], sd=gmm_c1_sd$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], #
args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], #
args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
theme_minimal()
plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_sd']]))) +
geom_point() + theme_minimal()
grid.arrange(plot_gaussians, plot_points, ncol=2)
# Sizes of the four sd-based subclusters
clusterings[['Manual_sd']] %>% table
## .
## 1_1 1_2 2_1 2_2
## 16 18 473 548
rm(c1_sd, c2_sd, gmm_c1_sd, gmm_c2_sd)
# Clean up the environment a bit
# Remove the intermediate objects, but only those that actually exist: the
# original call listed gmm_c1_sd twice, and c1_sd/c2_sd/gmm_c1_sd/gmm_c2_sd
# were already removed just above, so rm() emitted warnings for every name
# not found. Intersecting with ls() makes the cleanup warning-free.
rm(list = intersect(c('wss', 'datExpr_k_means', 'h_clusts', 'cc_output', 'cc_output_c1',
                      'cc_output_c2', 'best_k', 'ICA_output', 'ICA_clusters_names',
                      'signals_w_kurtosis', 'n_clust', 'gmm', 'gmm_points', 'gmm_labels',
                      'network', 'best_power', 'c', 'manual_clusters', 'manual_clusters_data',
                      'c1_sd', 'c2_sd', 'c1_mean', 'c2_mean', 'gmm_c1_sd', 'gmm_c2_sd',
                      'gmm_c1_mean', 'gmm_c2_mean', 'p1', 'p2', 'pca_data_projection',
                      'dend_meta', 'plot_gaussians', 'plot_points', 'n_clusters',
                      'viridis_score_cols', 'gg_colour_hue', 'create_viridis_dict'),
                    ls()))
Using Adjusted Rand Index:
Clusterings are not very similar except for K-Means and Hierarchical clustering
No clustering method resembles the SFARI scores at all
# Pairwise Adjusted Rand Index between every pair of clusterings. Only the
# upper triangle (including the diagonal, which is always 1) is filled in.
# seq_along/seq replace the 1:length(...) pattern, which misbehaves on empty
# lists.
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in seq_along(clusterings)){
  cluster1 = as.factor(clusterings[[i]])
  for(j in seq(i, length(clusterings))){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
# Annotated heatmap of the ARI matrix: no dendrograms, values printed in cells
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim)
The simple clustering methods only consider the 1st component, dividing by vertical lines
WGCNA doesn’t work well (classifies almost everything as a single class)
SFARI genes seem to be everywhere
1st PC seems to reflect the average level of expression of the genes
There seems to be a change in behaviour around PC1=0 (CC and WGCNA)
# Comparison table: first 3 PCs plus every labelling, joined with the DE
# statistics and average expression, fed to an interactive scatter plot.
# NOTE(review): clusterings[['cc_l1']] (consensus clustering) is not created
# in any chunk visible here -- confirm it is built upstream before this point.
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), k_means = as.factor(clusterings[['km']]),
hc = as.factor(clusterings[['hc']]), cc = as.factor(clusterings[['cc_l1']]),
ica = as.factor(clusterings[['ICA_min']]),
n_ica = as.factor(rowSums(ICA_clusters)), gmm = as.factor(clusterings[['GMM']]),
wgcna = as.factor(clusterings[['WGCNA']]), manual = as.factor(clusterings[['Manual']]),
manual_mean = as.factor(clusterings[['Manual_mean']]), manual_sd = as.factor(clusterings[['Manual_sd']]),
SFARI = as.factor(clusterings[['SFARI_score']]), SFARI_bool = as.factor(clusterings[['SFARI_bool']]),
syndromic = as.factor(clusterings[['syndromic']])) %>%
bind_cols(DE_info[DE_info$ID %in% rownames(datExpr_redDim),]) %>%
mutate(avg_expr = log2(rowMeans(datExpr)+1)[rownames(datExpr) %in% rownames(datExpr_redDim)])
rownames(plot_points) = plot_points$ID
selectable_scatter_plot(plot_points, plot_points)